%%capture
from __future__ import division
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
from plotly.subplots import make_subplots
# Load the three raw tables; each file ships with a different delimiter/encoding.
users=pd.read_csv("users.csv",delimiter=";")
movies=pd.read_csv("movies.csv",delimiter=";",encoding = "ISO-8859-1")
ratings=pd.read_csv("ratings.csv",sep='\t', lineterminator='\r',encoding = "UTF-16 LE")
# Parse the timestamp columns into real datetimes.
ratings['date']= pd.to_datetime(ratings['date'])
users['memberfor']= pd.to_datetime(users['memberfor'],format='%d/%m/%Y %H:%M')
ratings.head()
#print(ratings.info())
#print(ratings.isnull().sum())
#print(ratings.iloc[-1])
# Drop the last row (index 8196077) — presumably a malformed trailing record
# produced by the '\r' line terminator; TODO confirm against the raw file.
ratings.drop([8196077],inplace=True)
#print(ratings.isnull().sum())
ratings.isnull().sum()
# Downcast the id columns to the smallest integer dtype to save memory.
ratings['movieid']=pd.to_numeric(ratings['movieid'],downcast='integer')
ratings['userid']=pd.to_numeric(ratings['userid'],downcast='integer')
# Quick integrity report: dtypes, null counts and duplicate rows per table.
print(ratings.info())
print('duplicates: ',ratings.duplicated().sum())
#print(users.head())
print(users.info())
#print(users.isnull().sum())
print('duplicates: ',users.duplicated().sum())
#print(movies.head())
print(movies.info())
#print(movies.isnull().sum())
print('duplicates: ',movies.duplicated().sum())
# Histogram of user ages.
plot_data = [
    go.Histogram(
        x=users['age'],
    )
]
plot_layout = go.Layout(
    title='Age Distribution',  # fixed typo: was 'Age Distriution'
    xaxis_title="Age",
    yaxis_title="Number of Users"
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Bar chart of new user sign-ups per registration timestamp.
# NOTE(review): tx_data is an alias of `users`, so the new column is added to
# `users` itself, and the lambda is an identity map — the "YearMonthDay"
# column is just the raw timestamp; confirm whether truncation to day was
# actually intended.
tx_data=users
tx_data['memberforYearMonthDay'] = tx_data['memberfor'].map(lambda date: date )
newUsersmonth = tx_data.groupby(['memberforYearMonthDay'])['userid'].nunique().reset_index()
plot_data = [
    go.Bar(
        x=newUsersmonth['memberforYearMonthDay'],
        y=newUsersmonth['userid'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis_title="Number of Users",
    title='New users'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Monthly rating volume, keyed by an integer yyyymm value (100*year + month).
# NOTE(review): tx_data aliases `ratings`, so this overwrites ratings['date']
# in place with those integer keys; later sections reload the CSV to restore
# the real datetimes.
tx_data=ratings
tx_data['date'] = tx_data['date'].map(lambda date: 100*date.year + date.month )
newcomentsmonth = tx_data.groupby(['date'])['userid'].count().reset_index()
plot_data = [
    go.Bar(
        x=newcomentsmonth['date'],
        y=newcomentsmonth['userid'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis_title="Number of ratings",
    title='New monthly ratings'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Count ratings per movie ("views") and plot the 10 most viewed movies.
tx_data=ratings
tx_data=pd.merge(tx_data,movies, on="movieid", how="left")
tx_data.head()
# One row per movie with its rating count; sort most viewed first.
views = tx_data.groupby(['movieid'])['userid'].count().reset_index()
views=views.sort_values(by='userid', ascending=False)
views=views.rename(columns={"userid": "Views"})
# Right merge keeps every movie from the catalogue, even ones never rated.
views=pd.merge(views,movies, on="movieid", how="right")
views.head();
plot_data = [
    go.Bar(
        x=views['movieid'][0:10],
        y=views['Views'][0:10],
        text=views["moviename"][0:10] ,textposition='inside', marker_color='lightsalmon'
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Top 10 most viewed movies',
    xaxis_title="movieid",
    yaxis_title="Views"
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Blend average rating with popularity into a single 0-5 score:
# 70% weight on the normalized mean rating, 30% on the movie's share of the
# maximum view count, rescaled back to the 0-5 range.
movierank = tx_data.groupby(['movieid'])['rating'].mean().reset_index()
movierank = pd.merge(movierank,views, on="movieid", how="right")
movierank["FinalRating"]=(movierank["rating"]/5*0.7+0.3*movierank["Views"]/np.max(movierank["Views"]))*5
movierank = movierank.sort_values(by='FinalRating', ascending=False)
movierank.head();
plot_data = [
    go.Bar(
        x=movierank['movieid'][0:10],
        y=movierank['FinalRating'][0:10],
        text=movierank["moviename"][0:10] ,textposition='inside', marker_color='lightsalmon'
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Top 10 movies with best rated and most views',
    xaxis_title="movieid",
    yaxis_title="Rating"
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Scatter of mean rating vs. view count, restricted to movies with more than
# 100 views to reduce noise from rarely-rated titles.
plot_data = [
    go.Scatter(
        x=movierank.query("Views>100")['Views'],
        y=movierank.query("Views>100")['rating'],
        mode='markers'
    )
]
plot_layout = go.Layout(
    title='Rating score vs. number of views',
    xaxis_title="views",
    yaxis_title="rating score"
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Scatter of the weighted FinalRating against view counts (movies with more
# than 100 views); hovering shows the movie name.
plot_data = [
    go.Scatter(
        x=movierank.query("Views>100")['Views'],
        y=movierank.query("Views>100")['FinalRating'],
        mode='markers',
        hovertext=movierank.query("Views>100")["moviename"]
    )
]
plot_layout = go.Layout(
    # Fixed typo ('averagem' -> 'average') and labelled the y axis with the
    # quantity actually plotted (final rating), not 'views'.
    title="Final Rating (weighted average) vs. number of views",
    xaxis_title="views",
    yaxis_title="final rating"
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
def most_viewed_age(age1, age2):
    """Movies ranked by view count among users aged age1..age2 (inclusive).

    Returns a DataFrame with movieid, Views and the movie metadata columns;
    movies never rated by the age group appear with NaN Views (right merge).
    """
    merged = pd.merge(ratings, movies, on="movieid", how="left")
    merged = pd.merge(merged, users[["userid", "age"]], on="userid", how="left")
    in_range = merged.query("{}<=age<={}".format(age1, age2))
    counts = in_range.groupby(['movieid'])['userid'].count().reset_index()
    counts = counts.sort_values(by='userid', ascending=False)
    counts = counts.rename(columns={"userid": "Views"})
    return pd.merge(counts, movies, on="movieid", how="right")
# Pie charts of the 10 most viewed movies per age bracket.
specs = [[{'type':'domain'}, {'type':'domain'}], [{'type':'domain'}, {'type':'domain'}]]
subplot_titles=['Age 10-20', 'Age 20-30','Age 30-40','Age 40-50']
fig = make_subplots(rows=2, cols=2, specs=specs, subplot_titles=subplot_titles)
# subplot_titles are assigned row-major: (1,1), (1,2), (2,1), (2,2).
# BUG FIX: the original placed ages 20-30 at (2,1) and 30-40 at (1,2), so two
# pies carried the wrong caption. Each bracket now sits under its own title,
# and most_viewed_age is computed once per bracket instead of twice.
for age1, age2, row, col in [(10, 20, 1, 1), (20, 30, 1, 2), (30, 40, 2, 1), (40, 50, 2, 2)]:
    top = most_viewed_age(age1, age2)
    fig.add_trace(go.Pie(labels=top["moviename"][0:10], values=top["Views"][0:10]), row, col)
# Tune layout and hover info
fig.update_traces(hoverinfo='label+percent+name', textinfo='label', textposition='inside')
fig.update(layout_title_text='Mais vistos por idade',
           layout_showlegend=False)
fig = go.Figure(fig)
fig.show()
# Reload and re-clean all three tables: the EDA above overwrote
# ratings['date'] with integer yyyymm keys, and the popularity functions
# below need real datetimes.
users=pd.read_csv("users.csv",delimiter=";")
movies=pd.read_csv("movies.csv",delimiter=";",encoding = "ISO-8859-1")
ratings=pd.read_csv("ratings.csv",sep='\t', lineterminator='\r',encoding = "UTF-16 LE")
ratings['date']= pd.to_datetime(ratings['date'])
users['memberfor']= pd.to_datetime(users['memberfor'],format='%d/%m/%Y %H:%M')
ratings.drop([8196077],inplace=True)
#print(ratings.isnull().sum())
ratings.isnull().sum()
ratings['movieid']=pd.to_numeric(ratings['movieid'],downcast='integer')
ratings['userid']=pd.to_numeric(ratings['userid'],downcast='integer')
### pop_by_data (below) ranks movieids/names by number of ratings ("views")
# in the time window starting at date1 and ending at date2.
def pop_by_data(date1, date2):
    """Movies ordered by number of ratings ("views") within [date1, date2].

    Dates are 'dd/mm/yyyy' strings. Returns a DataFrame with movieid, Views
    and the movie metadata columns, most viewed first.
    """
    start = pd.to_datetime(date1, format='%d/%m/%Y')
    end = pd.to_datetime(date2, format='%d/%m/%Y')
    window = ratings.loc[(ratings['date'] >= start) & (ratings['date'] <= end)]
    window = pd.merge(window, movies, on="movieid", how="left")
    counts = window.groupby(['movieid'])['userid'].count().reset_index()
    counts = counts.sort_values(by='userid', ascending=False)
    counts = counts.rename(columns={"userid": "Views"})
    return pd.merge(counts, movies, on="movieid", how="left")
# %timeit is an IPython magic (this file is a notebook export): benchmark the
# popularity query for a one-month window, then display its result.
%timeit pop_by_data('01/05/2007','01/06/2007')
pop_by_data('01/05/2007','01/06/2007')
### pivot (disabled): would return a user x top-10-movie matrix of ratings,
### telling us which popular movies each user watched. Kept for reference;
### superseded by revert_pivot below, which transposes the layout.
'''
def pivot(date1,date2):
d2=pd.to_datetime(date2, format='%d/%m/%Y')
tx_user = ratings.loc[ratings['date'] <= d2]
tx_user=tx_user[tx_user.movieid.isin(pop_by_data(date1,date2)["movieid"][0:10])]
index=pd.pivot_table(tx_user, index='userid', columns='movieid', values='rating')
return index
#%timeit pivot('01/05/2007','01/06/2007')
pivot('01/05/2007','01/06/2007')
'''
def revert_pivot(date1, date2):
    """Movie x user rating matrix restricted to the 50 most viewed movies of
    [date1, date2], with rows ordered by that popularity ranking.

    Columns are the userids of everyone who rated any of those movies up to
    date2; NaN marks a movie the user has not rated.
    """
    d2 = pd.to_datetime(date2, format='%d/%m/%Y')
    tx_user = ratings.loc[ratings['date'] <= d2]
    # Compute the popularity ranking once — the original called pop_by_data
    # twice, redoing the same merge/groupby work for an identical result.
    top50 = pop_by_data(date1, date2)["movieid"][0:50]
    tx_user = tx_user[tx_user.movieid.isin(top50)]
    index = pd.pivot_table(tx_user, index='movieid', columns='userid', values='rating')
    return index.reindex(top50.tolist())
# Build the popularity pivot for May 2007 and inspect its user columns.
rp=revert_pivot('01/05/2007','01/06/2007')
rp.columns
def r_lista(userid, rp, n):
    """First n movieids (in rp's row order) that `userid` has NOT rated."""
    unseen = rp.index[np.isnan(rp[userid])]
    return list(unseen)[:n]
def table(date1, date2, N):
    """Per-user popularity recommendations.

    For every user in the window pivot, list the first N top-50 movies they
    have not rated before date2. Returns columns userid and
    recommended_p_movieids (a list per row).
    """
    rp = revert_pivot(date1, date2)
    user_ids = rp.columns.to_numpy()
    recs = [r_lista(u, rp, N) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "recommended_p_movieids": recs})
def revert_pivot_after(date1, date2, date3):
    """Movie x user matrix of ratings made in the follow-up window
    [date2, date3], restricted to the 50 most viewed movies of the earlier
    window [date1, date2] and ordered by that popularity ranking.
    """
    d2 = pd.to_datetime(date2, format='%d/%m/%Y')
    d3 = pd.to_datetime(date3, format='%d/%m/%Y')
    tx_user = ratings.loc[(ratings['date'] <= d3) & (ratings['date'] >= d2)]
    # Compute the popularity ranking once — the original called pop_by_data
    # twice for the same result.
    top50 = pop_by_data(date1, date2)["movieid"][0:50]
    tx_user = tx_user[tx_user.movieid.isin(top50)]
    index = pd.pivot_table(tx_user, index='movieid', columns='userid', values='rating')
    return index.reindex(top50.tolist())
def r_lista_after(userid, rp):
    """Movieids (rows of rp) that `userid` actually rated (non-NaN entries)."""
    rated_mask = ~np.isnan(rp[userid])
    return rp.index[rated_mask].tolist()
def table_after(date1, date2, date3):
    """For each user, the top-50 movies (from the window [date1, date2]) that
    they rated during the follow-up window [date2, date3].

    Returns columns userid and seen_foll_month (a list of movieids per row).
    """
    rp = revert_pivot_after(date1, date2, date3)
    user_ids = rp.columns.to_numpy()
    seen = [r_lista_after(u, rp) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "seen_foll_month": seen})
# Movies from the May 2007 top-50 that each user watched during June 2007.
verif=table_after('01/05/2007','01/06/2007','01/07/2007')
def is_true(userid, r):
    """How many recommended movies `userid` actually watched.

    Concatenates the user's seen_foll_month and recommended_p_movieids lists
    and counts repeated entries (first occurrence not counted).
    """
    combined = r.loc[userid, "seen_foll_month"] + r.loc[userid, "recommended_p_movieids"]
    return int(pd.Series(combined).duplicated().sum())
def label(r):
    """Append an n_success_recommendations column to `r`.

    For every user with follow-up activity (non-NaN seen_foll_month), count
    how many recommended movies they actually watched; users without
    follow-up activity get 0.
    """
    watched_users = r.dropna(subset=['seen_foll_month'])["userid"].to_numpy()
    r_index = r.set_index('userid')
    hits = [is_true(u, r_index) for u in watched_users]
    success = pd.DataFrame(data={"userid": watched_users, "n_success_recommendations": hits})
    r = pd.merge(r, success, on='userid', how='left')
    r["n_success_recommendations"].fillna(0, inplace=True)
    return r
def K_recomendacoes_populares(date1,date2,date3,N):
    """Popularity recommender evaluated against a follow-up window.

    Recommends to each user the first N top-50 movies of [date1, date2] that
    they have not yet rated, then checks which of those were actually watched
    during [date2, date3]. Dates are 'dd/mm/yyyy' strings.
    Returns the per-user table with success counts and a recall column.
    """
    recomendas=table(date1,date2,N)
    verif=table_after(date1,date2,date3)
    recomendas=pd.merge(recomendas,verif,on='userid',how='left')
    recomendas=label(recomendas)
    # Users with no follow-up activity get 0 instead of NaN.
    recomendas["seen_foll_month"].fillna(0,inplace=True)
    recomendas["recomlen"]=recomendas["recommended_p_movieids"].apply(len)
    # "nºvistas" = number of movies watched in the follow-up month.
    recomendas["nºvistas"]=recomendas["seen_foll_month"].apply(lambda x: len(x) if x!=0 else 0)
    recomendas["recall"]=recomendas["n_success_recommendations"]/recomendas["nºvistas"]
    recomendas["recall"].fillna(0,inplace=True)
    return recomendas
# Evaluate the popularity recommender for several recommendation-list sizes
# and tabulate precision / recall / F1 per size.
r=K_recomendacoes_populares('01/05/2007','01/06/2007','01/07/2007',5)
r.iloc[595:599]
precision=[]
percentage=[]
n=[]
recall=[]
f_score=[]
for i in [3,5,10,20]:
    n.append(i)
    r=K_recomendacoes_populares('01/05/2007','01/06/2007','01/07/2007',i)
    # str(x)[0:4] crudely truncates the percentage to ~4 characters for display.
    precision.append(str(np.mean(r.query("seen_foll_month != 0")["n_success_recommendations"]/i)*100)[0:4]+"%")
    recall.append(str(np.mean(r.query("seen_foll_month != 0")["recall"])*100)[0:4]+"%")
    p=np.mean(r.query("seen_foll_month != 0")["n_success_recommendations"]/i)*100
    R=np.mean(r.query("seen_foll_month != 0")["recall"])*100
    f_score.append(str(2*p*R/(R+p))[0:4]+"%")
    # Share of active users with at least one successful recommendation.
    percentage.append(str(r.query("n_success_recommendations > 0")["n_success_recommendations"].count()/r.query("seen_foll_month != 0")["seen_foll_month"].count()*100)[0:4]+"%")
measures = pd.DataFrame(data={"Precision":precision,"Recall":recall,"F1":f_score,"% of users that watched 1 or more recommendations":percentage, "n of recommendations":n})
measures = measures.set_index('n of recommendations')
measures
# NOTE(review): the six definitions below (revert_pivot_after, r_lista_after,
# table_after, is_true, label, K_recomendacoes_populares) are byte-identical
# redefinitions of the ones above — an artifact of re-running notebook cells.
# They could be deleted without changing behavior.
def revert_pivot_after(date1,date2,date3):
    d2=pd.to_datetime(date2, format='%d/%m/%Y')
    d3=pd.to_datetime(date3, format='%d/%m/%Y')
    tx_user = ratings.loc[(ratings['date'] <= d3) & (ratings['date'] >= d2)]
    tx_user=tx_user[tx_user.movieid.isin(pop_by_data(date1,date2)["movieid"][0:50])]
    index=pd.pivot_table(tx_user, index='movieid', columns='userid', values='rating')
    new_order=pop_by_data(date1,date2)["movieid"][0:50].tolist()
    index=index.reindex(new_order)
    return index
def r_lista_after(userid,rp):
    return rp[np.isnan(rp[userid])!=True].index.tolist()
def table_after(date1,date2,date3):
    rp=revert_pivot_after(date1,date2,date3)
    users=rp.columns.to_numpy()
    vfunc = np.vectorize(r_lista_after, excluded=["rp"], otypes=[list])
    return pd.DataFrame(data={"userid":users, "seen_foll_month":vfunc(users,rp=rp)})
verif=table_after('01/05/2007','01/06/2007','01/07/2007')
def is_true(userid,r):
    u=pd.DataFrame(data={"movies": r.loc[userid,"seen_foll_month"]+r.loc[userid,"recommended_p_movieids"]})
    return len(u[u.duplicated()])
def label(r):
    users=r.dropna(subset=['seen_foll_month'])["userid"].to_numpy()
    vfunc = np.vectorize(is_true, excluded=["r"], otypes=[int])
    r_index=r.set_index('userid')
    num=vfunc(users,r=r_index)
    label=pd.DataFrame(data={"userid":users, "n_success_recommendations":num})
    r=pd.merge(r,label,on='userid',how='left')
    r["n_success_recommendations"].fillna(0,inplace=True)
    return r
def K_recomendacoes_populares(date1,date2,date3,N):
    recomendas=table(date1,date2,N)
    verif=table_after(date1,date2,date3)
    recomendas=pd.merge(recomendas,verif,on='userid',how='left')
    recomendas=label(recomendas)
    recomendas["seen_foll_month"].fillna(0,inplace=True)
    recomendas["recomlen"]=recomendas["recommended_p_movieids"].apply(len)
    recomendas["nºvistas"]=recomendas["seen_foll_month"].apply(lambda x: len(x) if x!=0 else 0)
    recomendas["recall"]=recomendas["n_success_recommendations"]/recomendas["nºvistas"]
    recomendas["recall"].fillna(0,inplace=True)
    return recomendas
def pop_by_data_age(date1, date2, age1, age2):
    """Movies ranked by view count in [date1, date2], counting only ratings
    from users whose age lies in the half-open range [age1, age2).
    """
    start = pd.to_datetime(date1, format='%d/%m/%Y')
    end = pd.to_datetime(date2, format='%d/%m/%Y')
    window = ratings.loc[(ratings['date'] >= start) & (ratings['date'] <= end)]
    window = pd.merge(window, movies, on="movieid", how="left")
    window = pd.merge(window, users, on="userid", how="left")
    window = window.loc[(window["age"] >= age1) & (window["age"] < age2)]
    counts = window.groupby(['movieid'])['userid'].count().reset_index()
    counts = counts.sort_values(by='userid', ascending=False)
    counts = counts.rename(columns={"userid": "Views"})
    return pd.merge(counts, movies, on="movieid", how="left")
def revert_pivot_age(date1, date2, age1, age2):
    """Movie x user rating matrix for users aged [age1, age2), restricted to
    the 50 most viewed movies of that age group in [date1, date2] and ordered
    by that popularity ranking.
    """
    d2 = pd.to_datetime(date2, format='%d/%m/%Y')
    tx_user = ratings.loc[ratings['date'] <= d2]
    tx_user = pd.merge(tx_user, users, on="userid", how="left")
    tx_user = tx_user.loc[(tx_user["age"] >= age1) & (tx_user["age"] < age2)]
    # Compute the age-group popularity ranking once — the original called
    # pop_by_data_age twice for the same result.
    top50 = pop_by_data_age(date1, date2, age1, age2)["movieid"][0:50]
    tx_user = tx_user[tx_user.movieid.isin(top50)]
    index = pd.pivot_table(tx_user, index='movieid', columns='userid', values='rating')
    return index.reindex(top50.tolist())
def table_age(date1, date2, N, age1, age2):
    """Per-user popularity recommendations within one age bracket: the first
    N age-group-top-50 movies each user has not rated before date2."""
    rp = revert_pivot_age(date1, date2, age1, age2)
    user_ids = rp.columns.to_numpy()
    recs = [r_lista(u, rp, N) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "recommended_p_movieids": recs})
def K_recomendacoes_populares_age(date1, date2, date3, N):
    """Age-aware popularity recommender evaluated on [date2, date3].

    Builds per-bracket recommendation tables (brackets are half-open
    [lo, hi) per revert_pivot_age's age mask) and keeps each user's first
    occurrence, then verifies against the follow-up window like
    K_recomendacoes_populares.
    """
    # pd.concat replaces the deprecated DataFrame.append (removed in pandas
    # 2.0); bracket order is preserved so drop_duplicates(keep="first")
    # resolves any overlap identically to the original.
    brackets = [(5, 15), (15, 25), (25, 35), (35, 45), (45, 55), (55, 65)]
    parts = [table_age(date1, date2, N, lo, hi) for lo, hi in brackets]
    recomendas = pd.concat(parts, ignore_index=True)
    recomendas.drop_duplicates(subset="userid", keep="first", inplace=True)
    verif = table_after(date1, date2, date3)
    recomendas = pd.merge(recomendas, verif, on='userid', how='left')
    recomendas = label(recomendas)
    recomendas["seen_foll_month"].fillna(0, inplace=True)
    recomendas["nºvistas"] = recomendas["seen_foll_month"].apply(lambda x: len(x) if x != 0 else 0)
    recomendas["recall"] = recomendas["n_success_recommendations"] / recomendas["nºvistas"]
    recomendas["recall"].fillna(0, inplace=True)
    return recomendas
# Evaluate the age-bracket popularity recommender for several list sizes.
precision=[]
percentage=[]
n=[]
recall=[]
f_score=[]
for i in [3,5,10,20]:
    n.append(i)
    r=K_recomendacoes_populares_age('01/05/2007','01/06/2007','01/07/2007',i)
    precision.append(str(np.mean(r.query("seen_foll_month != 0")["n_success_recommendations"]/i)*100)[0:4]+"%")
    recall.append(str(np.mean(r.query("seen_foll_month != 0")["recall"])*100)[0:4]+"%")
    p=np.mean(r.query("seen_foll_month != 0")["n_success_recommendations"]/i)*100
    R=np.mean(r.query("seen_foll_month != 0")["recall"])*100
    f_score.append(str(2*p*R/(R+p))[0:4]+"%")
    # NOTE(review): the denominator here counts n_success_recommendations,
    # while the equivalent loop above counted seen_foll_month — confirm which
    # was intended; for non-null rows both counts should coincide.
    percentage.append(str(r.query("n_success_recommendations > 0")["n_success_recommendations"].count()/r.query("seen_foll_month != 0")["n_success_recommendations"].count()*100)[0:4]+"%")
# The next three calls only display intermediate tables; results are unused
# (the table_age assignment is overwritten before any further use).
pop_by_data_age('01/05/2007','01/06/2007',30,50)
recomendas=table_age('01/05/2007','01/06/2007',10,20,30)
revert_pivot_age('01/05/2007','01/06/2007',30,50)
measures = pd.DataFrame(data={"Precision":precision,"Recall":recall,"F1 score":f_score,"% of users that watched 1 or more recommendations":percentage, "n of recommendations":n})
measures = measures.set_index('n of recommendations')
print("Table of performance measures for popularity model considering age groups")
measures
# Reload and re-clean the tables for the association-rules section.
users=pd.read_csv("users.csv",delimiter=";")
movies=pd.read_csv("movies.csv",delimiter=";",encoding = "ISO-8859-1")
ratings=pd.read_csv("ratings.csv",sep='\t', lineterminator='\r',encoding = "UTF-16 LE")
ratings['date']= pd.to_datetime(ratings['date'])
users['memberfor']= pd.to_datetime(users['memberfor'],format='%d/%m/%Y %H:%M')
ratings.drop([8196077],inplace=True)
ratings.isnull().sum()
ratings['movieid']=pd.to_numeric(ratings['movieid'],downcast='integer')
ratings['userid']=pd.to_numeric(ratings['userid'],downcast='integer')
from mlxtend.preprocessing import TransactionEncoder
def revert_pivot_v(date1, date2):
    """Movie x user rating matrix over [date1, date2], restricted to the 50
    most viewed movies of that window.

    Dates are 'dd/mm/yyyy' strings. Returns a pivot indexed by movieid with
    one column per userid; NaN marks unrated pairs.
    """
    d1 = pd.to_datetime(date1, format='%d/%m/%Y')
    d2 = pd.to_datetime(date2, format='%d/%m/%Y')
    mask = (ratings['date'] >= d1) & (ratings['date'] <= d2)
    tx_user = ratings.loc[mask]
    # Keep only the top-50 most viewed movies of the window. (The original
    # also computed a value_counts()-based movie filter that was immediately
    # overwritten by this line — dead code, removed.)
    filter_movies = pop_by_data(date1, date2)["movieid"][0:50]
    # min_user_ratings = 0 keeps every user that appears in the window at
    # least once; retained for parity with the original filtering structure.
    min_user_ratings = 0
    filter_users = tx_user['userid'].value_counts() > min_user_ratings
    filter_users = filter_users[filter_users].index.tolist()
    ratings_new = tx_user[(tx_user['movieid'].isin(filter_movies)) & (tx_user['userid'].isin(filter_users))]
    return pd.pivot_table(ratings_new, index='movieid', columns='userid', values='rating')
def viram(userid, rp):
    """List of movieids `userid` rated (rows of rp with a non-NaN entry)."""
    seen_mask = ~np.isnan(rp[userid])
    return rp.index[seen_mask].tolist()
def table_viram(date1, date2):
    """One transaction per user: the list of top-50 movies they rated in the
    window [date1, date2]. Returns columns userid and seen."""
    rp = revert_pivot_v(date1, date2)
    user_ids = rp.columns.to_numpy()
    transactions = [viram(u, rp) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "seen": transactions})
# Build the transaction list (one list of seen movieids per user) and
# one-hot encode it for apriori.
trans=table_viram('01/05/2007','01/06/2007')
dataset=trans["seen"].tolist()
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
trans.iloc[:3]
print(len(trans))
from mlxtend.frequent_patterns import apriori
# Frequent itemsets with support >= 3%, annotated with their sizes.
frequent_itemsets = apriori(df, min_support=0.03, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets
frequent_itemsets=frequent_itemsets.sort_values(by="length",ascending=False)
frequent_itemsets.head()
from mlxtend.frequent_patterns import association_rules
# Association rules with confidence >= 0.4, strongest first.
rules=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
rules=rules.sort_values(by=["confidence","support","lift"],ascending=[False,False,False])
rules.head()
def viram2(userid, rp):
    """Set of movieids `userid` rated (non-NaN rows of rp's column)."""
    rated = rp.index[~np.isnan(rp[userid])]
    return set(rated.tolist())
def viram_same(date1, date2):
    """Per-user set of top-50 movies rated before date2 ("viu" = seen).

    Returns columns userid and viu (a set of movieids per row).
    """
    p = revert_pivot(date1, date2)
    user_ids = p.columns.to_numpy()
    seen_sets = [viram2(u, p) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "viu": seen_sets})
# Per-user seen-sets indexed by userid; the last line previews the rules that
# apply to user 84: the antecedent frozenset must be a subset (<=) or
# superset (>=) of the user's seen-set, and the consequent entirely unseen
# (set difference leaves it unchanged).
rec_a_rules = viram_same('01/05/2007','01/06/2007')
rec_a_rules=rec_a_rules.set_index('userid')
rules[((rules['antecedents'] >= rec_a_rules.loc[84,'viu']) | (rules['antecedents'] <= rec_a_rules.loc[84,'viu'])) & ((rules["consequents"]-rec_a_rules.loc[84,'viu'])==rules["consequents"])].groupby(["consequents"]).max()
def conf(userid,rules,rec_a_rules,k):
    """Movieids recommended to `userid` by its top-k applicable rules.

    A rule applies when its antecedent is a subset or superset of the user's
    seen-set and its consequent contains no movie the user has seen. The
    consequent frozensets of the k strongest such rules are flattened into a
    deduplicated flat list of movieids; [] when no rule applies.
    """
    # Building a DataFrame from the set of consequent frozensets yields one
    # column per element position, NaN-padded for shorter consequents.
    re=pd.DataFrame(data=set(rules[( ( rules['antecedents'] >= rec_a_rules.loc[userid,'viu'] ) | ( rules['antecedents'] <= rec_a_rules.loc[userid,'viu'] ) ) & ( (rules["consequents"]-rec_a_rules.loc[userid,'viu'])==rules["consequents"] )].groupby(["consequents"]).max().index.to_list()[0:k]))
    n=len(re.columns)
    if (n!=0):
        # Flatten all columns into one list, then drop duplicate movieids and
        # the NaN padding.
        data=re[0].to_list()
        for i in range(1,n):
            data+=re[i].to_list()
        re2=pd.DataFrame(data=data)
        re2=re2.drop_duplicates(keep='first')
        re2=re2.dropna()
        return re2[0].to_list()
    else:
        return []
def rec_rules(rules, rec_a_rules, N):
    """Association-rule recommendations: for every user in rec_a_rules, the
    movieids suggested by their top-N applicable rules (see conf)."""
    user_ids = rec_a_rules.index.to_numpy()
    recs = [conf(u, rules, rec_a_rules, N) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "recomendas": recs})
# NOTE(review): r_lista_after and table_after below are byte-identical
# redefinitions of the earlier versions (notebook re-run artifact).
def r_lista_after(userid,rp):
    return rp[np.isnan(rp[userid])!=True].index.tolist()
def table_after(date1,date2,date3):
    rp=revert_pivot_after(date1,date2,date3)
    users=rp.columns.to_numpy()
    vfunc = np.vectorize(r_lista_after, excluded=["rp"], otypes=[list])
    return pd.DataFrame(data={"userid":users, "seen_foll_month":vfunc(users,rp=rp)})
def revert_pivot_after2(date1, date2, date3):
    """Movie x user rating matrix for the follow-up window [date2, date3],
    restricted to the movie universe of revert_pivot_v(date2, date3).

    Note: date1 is accepted for signature parity with revert_pivot_after but
    is not used by this variant.
    """
    d2 = pd.to_datetime(date2, format='%d/%m/%Y')
    d3 = pd.to_datetime(date3, format='%d/%m/%Y')
    follow_up = ratings.loc[(ratings['date'] <= d3) & (ratings['date'] >= d2)]
    keep = revert_pivot_v(date2, date3).index
    follow_up = follow_up[follow_up.movieid.isin(keep)]
    return pd.pivot_table(follow_up, index='movieid', columns='userid', values='rating')
def table_after_2(date1,date2,date3):
    """Movies each user rated during the follow-up window [date2, date3],
    restricted to revert_pivot_v's movie universe (cf. table_after).

    NOTE(review): otypes=[set] although r_lista_after returns a list — the
    stored values are lists in practice; confirm the declared otype is
    intentional.
    """
    rp=revert_pivot_after2(date1,date2,date3)
    users=rp.columns.to_numpy()
    vfunc = np.vectorize(r_lista_after, excluded=["rp"], otypes=[set])
    return pd.DataFrame(data={"userid":users, "seen_foll_month":vfunc(users,rp=rp)})
def is_true2(userid, r):
    """How many rule-recommended movies `userid` actually watched.

    Counts repeated entries in the concatenation of the user's
    seen_foll_month and recomendas lists (first occurrence not counted).
    """
    combined = r.loc[userid, "seen_foll_month"] + r.loc[userid, "recomendas"]
    return int(pd.Series(combined).duplicated().sum())
def label2(r):
    """Append n_success_recommendations for the rule-based recommender.

    For users with follow-up activity, count how many rule-recommended
    movies they actually watched; users without activity get 0.
    """
    watched_users = r.dropna(subset=['seen_foll_month'])["userid"].to_numpy()
    r_index = r.set_index('userid')
    hits = [is_true2(u, r_index) for u in watched_users]
    success = pd.DataFrame(data={"userid": watched_users, "n_success_recommendations": hits})
    r = pd.merge(r, success, on='userid', how='left')
    r["n_success_recommendations"].fillna(0, inplace=True)
    return r
def K_recomendacoes_associa(date1, date2, date3, k):
    """Association-rule recommender trained on [date1, date2] and evaluated
    on the follow-up window [date2, date3].

    Mines apriori rules (support >= 2.5%, confidence >= 0.5) over the per-user
    transactions, recommends each user the consequents of their top-k rules,
    and returns the per-user table with precision (P) and recall (R) columns.
    """
    trans = table_viram(date1, date2)
    dataset = trans["seen"].tolist()
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.025, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
    rules = rules.sort_values(by=["confidence", "support", "lift"], ascending=[False, False, False])
    rec_a_rules = viram_same(date1, date2)
    rec_a_rules = rec_a_rules.set_index('userid')
    recs = rec_rules(rules, rec_a_rules, k)
    # BUG FIX: the verification window was hard-coded to
    # ('01/05/2007','01/06/2007','01/07/2007') regardless of the arguments;
    # use the caller-supplied dates instead.
    verif = table_after_2(date1, date2, date3)
    recs2 = pd.merge(recs, verif, on='userid', how='left')
    recs2 = label2(recs2)
    recs2["seen_foll_month"].fillna(0, inplace=True)
    recs2["recomlen"] = recs2["recomendas"].apply(len)
    recs2["P"] = recs2["n_success_recommendations"] / recs2["recomlen"]
    recs2["P"].fillna(0, inplace=True)
    recs2["n_watched"] = recs2["seen_foll_month"].apply(lambda x: len(x) if x != 0 else 0)
    recs2["R"] = recs2["n_success_recommendations"] / recs2["n_watched"]
    recs2["R"].fillna(0, inplace=True)
    return recs2
# Evaluate the association-rule recommender for several rule counts.
recomendas= K_recomendacoes_associa('01/05/2007','01/06/2007','01/07/2007',10)
recomendas.head()
precision=[]
percentage=[]
n=[]
recall=[]
f_score=[]
for i in [3,5,10,20]:
    n.append(i)
    r=K_recomendacoes_associa('01/05/2007','01/06/2007','01/07/2007',i)
    # str(x)[0:4] crudely truncates the percentage to ~4 characters for display.
    precision.append(str(np.mean(r.query("seen_foll_month != 0")["P"])*100)[0:4]+"%")
    recall.append(str(np.mean(r.query("seen_foll_month != 0")["R"])*100)[0:4]+"%")
    p=np.mean(r.query("seen_foll_month != 0")["P"])*100
    R=np.mean(r.query("seen_foll_month != 0")["R"])*100
    f_score.append(str(2*p*R/(R+p))[0:4]+"%")
    # Share of active users with at least one successful recommendation.
    percentage.append(str(r.query("n_success_recommendations > 0")["n_success_recommendations"].count()/r.query("seen_foll_month != 0")["n_success_recommendations"].count()*100)[0:4]+"%")
measures = pd.DataFrame(data={"Precision":precision,"Recall":recall,"F1":f_score,"% of users that watched 1 or more recommendations":percentage, "n of rules":n})
measures = measures.set_index('n of rules')
measures
For this model we decided to use Surprise, a Python scikit specialized in building and analyzing recommender systems. This way it was possible to implement several ready-to-use prediction models with different similarity measures, both user-based and item-based.
Before building any model, we first analyzed the distributions of the number of ratings per movie and per user by plotting their histograms. This helps us understand the best thresholds for deciding whether a movie or user is relevant — that is, the number of interactions they should have in order not to be discarded from the final dataset we will be working with. This step is very important because it reduces the dimensionality of our data and therefore speeds up our processing and running times. The resulting histograms are available in the interactive images of our Colab notebook version.
After choosing to discard users and movies with fewer than a thousand ratings in total, we looked at the time period of the data our systems would have access to. We decided to use ratings from the beginning of 2006 to the end of the first semester of 2007. We then used a Reader to load our pre-processed dataframe in the format the package expects, divided the data into training and test sets as usual with a 75%/25% split, and created a list of models from the Surprise package with different similarity measures and ways of computing the predictions.
For each of the models (KNNBasic, KNNWithMeans and KNNWithZScore) we tried six options covering the possible combinations of similarity measures (cosine, msd, Pearson) for both item-based and user-based recommendations. We then computed the predictions and compared the results based on the root mean squared error to choose the model with the best performance. As the following table shows, the first two models have the lowest errors and are very close together; we decided to work with the user-based KNNWithZScore model that uses the Pearson correlation as its similarity measure.
# --- Collaborative-filtering (Surprise) section ---
# Reload and re-clean the ratings (earlier cells mutated the 'date' column).
ratings=pd.read_csv("ratings.csv",sep='\t', lineterminator='\r',encoding = "UTF-16 LE")
ratings['date']= pd.to_datetime(ratings['date'])
ratings.drop([8196077],inplace=True)
ratings2 = ratings.copy()
# Keep only ratings made up to 1 June 2007.
max_date = datetime(2007, 6, 1)
ratings2 = ratings2.loc[ratings['date'] <= max_date]
# Keep only movies and users with more than 1000 ratings, shrinking the
# matrix the KNN models must handle.
min_movie_ratings = 1000
filter_movies = ratings2['movieid'].value_counts() > min_movie_ratings
filter_movies = filter_movies[filter_movies].index.tolist()
min_user_ratings = 1000
filter_users = ratings2['userid'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()
ratings_new = ratings2[(ratings2['movieid'].isin(filter_movies)) & (ratings2['userid'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(ratings2.shape[0]))
print('The new data frame shape:\t{}'.format(ratings_new.shape[0]))
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic,KNNWithMeans,KNNWithZScore
from surprise.model_selection import train_test_split
from surprise import accuracy
# Restrict the data to ratings from 1 Jan 2006 onward and load it into
# Surprise's Dataset format.
min_date = datetime(2006, 1, 1)
def filter_by_date(min_date,max_date,table):
    """Rows of `table` whose 'date' lies in [min_date, max_date] (inclusive)."""
    return table.loc[(table['date'] >= min_date) & (table['date'] <= max_date)]
ratings_new = filter_by_date(min_date,max_date,ratings_new)
ratings_new.movieid = ratings_new.movieid.astype(int)
n_users = ratings_new.userid.unique().shape[0]
n_items = ratings_new.movieid.unique().shape[0]
n_rows = ratings_new.shape[0]
print("Number of Rows: {}, Number of users: {} , Number of movies: {}".format(n_rows, n_users, n_items))
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_new[['userid', 'movieid', 'rating']], reader)
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)
algo_list = []
# Similarity configurations: item-based then user-based, each with cosine,
# msd and pearson similarity (6 combinations), replacing 18 copy-pasted
# constructor calls with an equivalent loop.
sim_option_grid = [
    {'name': name, 'user_based': user_based}
    for user_based in (False, True)
    for name in ('cosine', 'msd', 'pearson')
]
# One instance of each KNN variant per configuration, grouped by algorithm
# class (KNNBasic first, then KNNWithMeans, then KNNWithZScore) — the same
# order as before, so algo_list[-1] remains the user-based pearson
# KNNWithZScore model used later.
for algo_cls in (KNNBasic, KNNWithMeans, KNNWithZScore):
    for sim_options in sim_option_grid:
        algo_list.append(algo_cls(sim_options=sim_options))
input_rows = []
# Iterate over all algorithms
for algorithm in algo_list:
    # Train the algorithm on the trainset, and predict ratings for the testset
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)
    # Then compute RMSE
    result = accuracy.rmse(predictions,verbose=False)
    # Get algorithm parameters
    similarity = algorithm.sim_options['name']
    base = algorithm.sim_options['user_based']
    if base:
        base = 'user_based'
    else:
        base = 'item_based'
    # Add the algorithm full name ("Class/similarity/base") and its RMSE.
    input_rows.append((str(algorithm).split(' ')[0].split('.')[-1]+"/"+similarity+"/"+base,result))
rows_list = []
for row in input_rows:
    dict1 = {}
    # get input row in dictionary format
    # key = col_name
    dict1.update(Algorithm = row[0], RMSE = row[1])
    rows_list.append(dict1)
# Comparison table, best (lowest RMSE) model first.
performance_compare = pd.DataFrame(rows_list,columns=['Algorithm','RMSE'])
performance_compare.set_index('Algorithm').sort_values('RMSE')
def predict_rank(movieid, userid, model):
    """Estimated rating for the (userid, movieid) pair.

    model.predict returns a Prediction tuple whose element at index 3 is the
    estimated rating.
    """
    prediction = model.predict(userid, movieid)
    return prediction[3]
def r_lista_3(userid,rp,k,model):
    """Top-k unseen top-50 movies for `userid`, ranked by predicted rating.

    NOTE(review): the excluded names "movielist"/"ranklist" match no
    parameter of predict_rank and are ignored; "userid" and "model" are the
    effective exclusions since they are passed as keyword arguments.
    """
    vfunc2 = np.vectorize(predict_rank, excluded=["userid","movielist","ranklist","model"], otypes=[list])
    # Movies among the 50 popular ones that the user has not rated.
    v2 = rp[np.isnan(rp[userid])].index.to_numpy()
    # Estimated rating for each unseen movie for this user.
    table = pd.DataFrame(data={"movieid": v2, "rank": vfunc2(v2, userid=userid,model=model)})
    # Sort by estimated rating, best first.
    table = table.sort_values(by="rank", ascending=False)
    # Return the k movies with the highest predicted rating.
    return table["movieid"].to_list()[0:k]
def table_3(date1, date2, n, model):
    """Model-ranked recommendations: for each user, the n unseen top-50
    movies with the highest predicted rating (column recomRank)."""
    rp = revert_pivot(date1, date2)
    user_ids = rp.columns.to_numpy()
    ranked = [r_lista_3(u, rp, n, model) for u in user_ids]
    return pd.DataFrame(data={"userid": user_ids, "recomRank": ranked})
#recomendas=table_3('01/05/2007','01/06/2007',10)
#recomendas=table_3('01/05/2007','01/06/2007',10)
#verif=table_after('01/05/2007','01/06/2007','01/07/2007')
#revert_pivot_after('01/05/2007','01/06/2007','01/07/2007').head()
#recomendas=pd.merge(recomendas,verif,on='userid',how='left')
#recomendas.head()
def is_true_3(userid, r):
    """Count how many recommended movies the user actually watched.

    Concatenates the user's watched-next-month list with their recommendation
    list; every duplicate in the combined list is a successful recommendation.
    """
    combined = r.loc[userid, "seen_foll_month"] + r.loc[userid, "recomRank"]
    frame = pd.DataFrame(data={"movies": combined})
    return len(frame[frame.duplicated()])
def label_3(r):
users=r.dropna(subset=['seen_foll_month'])["userid"].to_numpy()
vfunc = np.vectorize(is_true_3, excluded=["r"], otypes=[int])
r_index=r.set_index('userid')
num=vfunc(users,r=r_index)
label=pd.DataFrame(data={"userid":users, "n_success_recommendations":num})
r=pd.merge(r,label,on='userid',how='left')
r["n_success_recommendations"].fillna(0,inplace=True)
return r
#recomendas=label_3(recomendas)
#recomendas.head()
# Fit the last algorithm in algo_list on the training set and sanity-check a
# single prediction for the pair (user 288, movie 49294).
model = algo_list[-1]
model.fit(trainset)
print(model.predict(288,49294.0))
ratings.query('userid == 288 & movieid == 49294.0')  # look up the actual rating for the same pair
def K_recomendacoes_ranking(date1, date2, date3, N, algo):
    """Train `algo` on ratings inside [date1, date2] and evaluate its top-N
    recommendations against what users actually watched up to date3.

    Dates are 'dd/mm/yyyy' strings. Returns the per-user recommendation table
    produced by table_3, merged with the verification table and annotated with
    'n_success_recommendations', 'recomlen', 'nºvistas' and 'recall' columns.
    """
    min_date = datetime.strptime(date1, '%d/%m/%Y')
    max_date = datetime.strptime(date2, '%d/%m/%Y')
    ratings2 = ratings.copy()
    # BUG FIX: this pre-filter previously used a hard-coded datetime(2007, 6, 1)
    # cutoff (and masked with the original `ratings` frame), which emptied any
    # training window past June 2007; cut at the requested window end instead.
    ratings2 = ratings2.loc[ratings2['date'] <= max_date]
    # Keep only movies and users with more than 1000 ratings, to densify the
    # user-item matrix before collaborative filtering.
    min_movie_ratings = 1000
    movie_counts = ratings2['movieid'].value_counts()
    filter_movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()
    min_user_ratings = 1000
    user_counts = ratings2['userid'].value_counts()
    filter_users = user_counts[user_counts > min_user_ratings].index.tolist()
    ratings_new = ratings2[ratings2['movieid'].isin(filter_movies)
                           & ratings2['userid'].isin(filter_users)]
    # Restrict to the training window; .copy() avoids SettingWithCopy on the
    # astype assignment below.
    ratings_new = ratings_new.loc[(ratings_new['date'] >= min_date)
                                  & (ratings_new['date'] <= max_date)].copy()
    ratings_new['movieid'] = ratings_new['movieid'].astype(int)
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(ratings_new[['userid', 'movieid', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=.25)
    model = algo
    model.fit(trainset)
    # Recommend, then verify against the following-month viewing.
    recomendas = table_3(date1, date2, N, model)
    verif = table_after(date1, date2, date3)
    recomendas = pd.merge(recomendas, verif, on='userid', how='left')
    recomendas = label_3(recomendas)
    # Assignment instead of chained inplace fillna (deprecated in pandas 2.x).
    recomendas["seen_foll_month"] = recomendas["seen_foll_month"].fillna(0)
    recomendas["recomlen"] = recomendas["recomRank"].apply(len)
    recomendas["nºvistas"] = recomendas["seen_foll_month"].apply(lambda x: len(x) if x != 0 else 0)
    recomendas["recall"] = recomendas["n_success_recommendations"] / recomendas["nºvistas"]
    recomendas["recall"] = recomendas["recall"].fillna(0)
    return recomendas
K_recomendacoes_ranking('01/05/2007','01/06/2007','01/07/2007',10,algo_list[-1]).head()
# Evaluate the ranking recommender for several list sizes on the May->June
# 2007 training window, verified against July viewing.
precision = []
percentage = []
n = []
recall = []
f_score = []
for k in [3, 5, 10, 20]:
    n.append(k)
    r = K_recomendacoes_ranking('01/05/2007', '01/06/2007', '01/07/2007', k, algo_list[-1])
    # Only users with some activity in the following month are scored.
    active = r.query("seen_foll_month != 0")
    p = np.mean(active["n_success_recommendations"] / k) * 100
    R = np.mean(active["recall"]) * 100
    # str(...)[0:4] keeps the original truncated-percentage display format.
    precision.append(str(p)[0:4] + "%")
    recall.append(str(R)[0:4] + "%")
    f_score.append(str(2 * p * R / (R + p))[0:4] + "%")
    hits = r.query("n_success_recommendations > 0")["n_success_recommendations"].count()
    seen = active["n_success_recommendations"].count()
    percentage.append(str(hits / seen * 100)[0:4] + "%")
measures = pd.DataFrame(data={"Precisão": precision, "Recall": recall, "F1": f_score, "Percentagem de users que viram 1 das recomendações": percentage, "Nº recomendações": n})
measures = measures.set_index('Nº recomendações')
measures
def measures_calculator(r):
    """Return (precision, recall, f1) as percentage strings truncated to 4 chars.

    Only users with activity in the following month ('seen_foll_month' != 0)
    contribute to the means; precision divides successes by each user's actual
    recommendation-list length ('recomlen').
    """
    active = r.query("seen_foll_month != 0")
    p = np.mean(active["n_success_recommendations"] / active["recomlen"]) * 100
    R = np.mean(active["recall"]) * 100
    f1 = 2 * p * R / (R + p)
    return str(p)[0:4], str(R)[0:4], str(f1)[0:4]
# Compare the popularity recommender against collaborative filtering over a
# sliding one-month training window across the first half of 2007.
P_r, P_p, P_f = [], [], []
CF_r, CF_p, CF_f = [], [], []
data = []
months = ['01', '02', '03', '04', '05', '06', '07', '08']
for i in range(len(months) - 2):
    date1 = '01/' + months[i] + '/2007'
    date2 = '01/' + months[i + 1] + '/2007'
    date3 = '01/' + months[i + 2] + '/2007'
    data.append(date2)
    pop_scores = measures_calculator(K_recomendacoes_populares(date1, date2, date3, 5))
    cf_scores = measures_calculator(K_recomendacoes_ranking(date1, date2, date3, 10, algo_list[-1]))
    P_p.append(pop_scores[0])
    P_r.append(pop_scores[1])
    P_f.append(pop_scores[2])
    CF_p.append(cf_scores[0])
    CF_r.append(cf_scores[1])
    CF_f.append(cf_scores[2])
# measures_calculator returns truncated percentage strings; parse them back to
# floats so the monthly means can be computed.
CF_p = np.array(CF_p, dtype=float)
CF_r = np.array(CF_r, dtype=float)
CF_f = np.array(CF_f, dtype=float)
P_p = np.array(P_p, dtype=float)
P_r = np.array(P_r, dtype=float)
P_f = np.array(P_f, dtype=float)
print('average precision of collaborative filtering: ', np.mean(CF_p))
print('average precision of popularity: ', np.mean(P_p))
print('average recall of collaborative filtering: ', np.mean(CF_r))
print('average recall of popularity: ', np.mean(P_r))
print('average f1 score of collaborative filtering: ', np.mean(CF_f))
print('average f1 score of popularity: ', np.mean(P_f))
In this project we were able to build recommendation systems based on 3 different approaches: Popularity, Association Rules and both item and user based Collaborative Filtering.
Two different popularity models have been implemented, one simpler that gives a personalized list for the most watched movies and one recommending system that also considers age groups. For both implementations the results were satisfactory for this task and the performance measures were very similar. We achieved the best precision when giving 3 recommendations (around 25%) and the recall increases with the number of recommendations.
The association rules algorithm was the one that performed the worst, the best precision was again for using only 3 rules and achieved only 10%. The recall also increases with the number of recommendations (and rules), but was much lower than the one in the popularity model. Our best f-score was only 12.6% for 20 rules and even for this high number of recommendations more than 50% of the users did not watch any of the movies recommended.
The last approach to implement was Collaborative Filtering, where we tried several different combinations of methods for calculating the nearest neighbors and similarity measures, and used both item-based and user-based recommendations. After testing the list of models, the one we decided worked best was the user-based KNNWithZScore, which uses Pearson correlation as the similarity measure. For the same time period and data as the previous algorithms this model achieved a slightly lower, but more stable, precision than the popularity approach, and an f1 score around 20% that does not change much with the number of recommendations because of the correspondingly increasing recall.
In order to better compare our models we decided to compute recommendations and test their performance throughout a time window in the year of 2007 (where we have a higher volume of user ratings) and calculate their means. The average precision for the collaborative model was 14.5% and for popularity it was 17.1%. The f1 score was around 21% for both models, but collaborative filtering clearly outperformed popularity in terms of recall, with an average of 36%.
After this we wanted to create a more sophisticated implementation of a recommender system that could be used in a production environment. Therefore we created a form in our dynamic interactive report that generates recommendations according to the time window and approach of your choice among the ones developed in this project. This application can be accessed at the following link: https://colab.research.google.com/drive/1U9JTlHfXqQMcfnSSMu2v95gZN6Lj_3mN?usp=sharing.
For future reference we would like to study some methods that we think could potentially boost our models' performances. The first thing we could implement is link analysis and community discovery to help us understand the relationships between users and also movies. After having this information we could build a more complex model that would know which approach between Collaborative Filtering, Popularity and Association Rules it should use to generate recommendations for a particular user.
As a final note we would like to mention that in order to fully and correctly test our models in a real-world application, it would probably be more suitable to implement an A/B test. In this method we select a percentage of the user population to receive recommendations, and afterwards we evaluate the impact of the recommendations on that group of users in comparison to the group that was not given any suggestions.
from IPython.display import HTML
# Collapse the notebook's input prompt column ("In [n]:") for a cleaner exported report.
display(HTML('<style>.prompt{width: 0px; min-width: 0px; visibility: collapse}</style>'))
# Inject a jQuery-based toggle (code cells hidden on load) with a button to
# show/hide all raw code in the rendered notebook.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')